Data cleaning
path = "data/2017"
temp <- list.files(path=path)
data1 <- read_excel(paste(path,temp[1], sep="/"), sheet = "Student", skip = 1)
data2 <- read_csv(paste(path,temp[2], sep="/"))
tag <- data2 %>%
select(`ID`, `UoA ID`, `UoA UPI`, `Tags`)
head(tag,5)
## # A tibble: 5 x 4
## ID `UoA ID` `UoA UPI` Tags
## <int> <chr> <chr> <chr>
## 1 1.80e8 EXTERNAL EXTERNAL CIE Newsletter,2016 MCE Student,2016 CIE Pa~
## 2 1.80e8 160160291 CSHE674 CIE Newsletter,2016 MCE Student,2016 CIE Pa~
## 3 1.80e8 2667562 ADEK841 Unleash Space,2017 Create and Make Workshop~
## 4 1.80e8 4099538 EERS001 CIE Newsletter,2016 MCE Student,2016 CIE Pa~
## 5 1.80e8 4517867 COSU006 CIE Newsletter,2016 MCE Student,Weekly News~
# Participant programme
partProg <- tag$Tags %>%
strsplit(., ",") %>%
setNames(tag$`UoA ID`) %>%
melt(value.name = "programme")
colnames(partProg) <- c("programme", "ID")
head(partProg,5)
## programme ID
## 1 CIE Newsletter EXTERNAL
## 2 2016 MCE Student EXTERNAL
## 3 2016 CIE Participant EXTERNAL
## 4 MCE Current Student and Alumni EXTERNAL
## 5 2017 CIE Participant EXTERNAL
#TODO: Active in Programme?
partInfo<- data1 %>%
select(`ID`, `Acad Prog`, `Status`, `Descriptio`, `Acad Plan`, `Plan Description`, `Owner of Major/Spec/Module`)
head(partInfo,5)
## # A tibble: 5 x 7
## ID `Acad Prog` Status Descriptio `Acad Plan` `Plan Descripti~
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 5695~ BABS Activ~ Bachelor ~ PSYC-BSBA Psychology (BSc)
## 2 2198~ BCOM Activ~ Bachelor ~ INEN-BCOM Innovation & En~
## 3 5129~ BSC Activ~ Bachelor ~ BIOMED-BSC Biomedical Scie~
## 4 8075~ BA Activ~ Bachelor ~ POIR-BAMAJ Politics and In~
## 5 9675~ BCOM Activ~ Bachelor ~ MGT-BCOM Management
## # ... with 1 more variable: `Owner of Major/Spec/Module` <chr>
# Outer join two tables
df <- merge(x=partProg, y=partInfo, by="ID", all.x = TRUE)
head(df,5)
## ID programme
## 1 1000280 2017 Velocity Participant
## 2 1000280 2017 CIE Participant
## 3 1000280 2017 Velocity Innovation Challenge Participant
## 4 1000280 Velocity
## 5 1000280 2017 Velocity Innovation Challenge Commercial Participant
## Acad Prog Status Descriptio Acad Plan Plan Description
## 1 <NA> <NA> <NA> <NA> <NA>
## 2 <NA> <NA> <NA> <NA> <NA>
## 3 <NA> <NA> <NA> <NA> <NA>
## 4 <NA> <NA> <NA> <NA> <NA>
## 5 <NA> <NA> <NA> <NA> <NA>
## Owner of Major/Spec/Module
## 1 <NA>
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
# Filter out non-students
df_stud <- df %>%
filter(!is.na(`Acad Prog`))
head(df_stud,10)
## ID
## 1 101253805
## 2 101253805
## 3 101253805
## 4 101253805
## 5 101253805
## 6 101253805
## 7 101253805
## 8 101253805
## 9 101253805
## 10 101253805
## programme
## 1 2018 Create and Make International Women's Day Maker Workshop Participant
## 2 2018 Velocity $100k Challenge Participant
## 3 2017 Create and Make Workshop Participant
## 4 2018 CIE Participant
## 5 2018 Create and Make Workshop Participant
## 6 2017 Unleash Space Participant
## 7 2018 Idea Bootcamp Participant
## 8 2018 Unleash Space International Women's Day Lunch
## 9 Velocity Participant
## 10 2018 Velocity I2B Participant
## Acad Prog Status Descriptio Acad Plan
## 1 COPUA Active in Programme Certificate of Proficiency SCI-COPUA
## 2 COPUA Active in Programme Certificate of Proficiency SCI-COPUA
## 3 COPUA Active in Programme Certificate of Proficiency SCI-COPUA
## 4 COPUA Active in Programme Certificate of Proficiency SCI-COPUA
## 5 COPUA Active in Programme Certificate of Proficiency SCI-COPUA
## 6 COPUA Active in Programme Certificate of Proficiency SCI-COPUA
## 7 COPUA Active in Programme Certificate of Proficiency SCI-COPUA
## 8 COPUA Active in Programme Certificate of Proficiency SCI-COPUA
## 9 COPUA Active in Programme Certificate of Proficiency SCI-COPUA
## 10 COPUA Active in Programme Certificate of Proficiency SCI-COPUA
## Plan Description Owner of Major/Spec/Module
## 1 Science Science
## 2 Science Science
## 3 Science Science
## 4 Science Science
## 5 Science Science
## 6 Science Science
## 7 Science Science
## 8 Science Science
## 9 Science Science
## 10 Science Science
# Only select specific year
df_stud <- df_stud %>%
mutate(year=substring(`programme`,0,4), programme=substring(`programme`,6)) %>%
filter(year=="2017")
head(df_stud,5)
## ID programme Acad Prog
## 1 101253805 Create and Make Workshop Participant COPUA
## 2 101253805 Unleash Space Participant COPUA
## 3 101253805 Unleash Space Access COPUA
## 4 101253805 Create and Make Arduino Workshop Participant COPUA
## 5 101253805 Create and Make Space Workshop Participant COPUA
## Status Descriptio Acad Plan
## 1 Active in Programme Certificate of Proficiency SCI-COPUA
## 2 Active in Programme Certificate of Proficiency SCI-COPUA
## 3 Active in Programme Certificate of Proficiency SCI-COPUA
## 4 Active in Programme Certificate of Proficiency SCI-COPUA
## 5 Active in Programme Certificate of Proficiency SCI-COPUA
## Plan Description Owner of Major/Spec/Module year
## 1 Science Science 2017
## 2 Science Science 2017
## 3 Science Science 2017
## 4 Science Science 2017
## 5 Science Science 2017
df_stud %>%
group_by(`Owner of Major/Spec/Module`) %>%
summarise(count=n()) %>%
ggplot() +
geom_bar(aes(x=reorder(`Owner of Major/Spec/Module`, count), count, fill=count), stat="identity") +
guides(fill=FALSE) +
coord_flip() +
ggtitle("Faculty split overall") +
theme_hc() +
scale_fill_continuous_tableau() + labs(x="")

# Lollipop chart
df_stud %>%
group_by(`programme`) %>%
summarise(count=n()) %>%
filter(!`programme` %in% c("CIE Participant", "Velocity Participant", "Unleash Space Participant", "Unleash Space Access", "Equipment Training Participant" )) %>%
arrange(count) %>%
mutate(programme = factor(`programme`, levels=.$programme)) %>%
ggplot(aes(count, programme, label=count, fill=count, color=-count)) +
geom_segment(aes(x=0, y=programme, xend=count, yend=programme)) +
geom_point(size=5) +
guides(fill=FALSE, color=FALSE) +
ggtitle("Programme split overall") +
theme_minimal() +
scale_fill_continuous_tableau() + labs(x="", y="")

#geom_text(nudge_x=10)
# Bar chart
df_stud %>%
group_by(`programme`) %>%
summarise(count=n()) %>%
filter(!`programme` %in% c("CIE Participant", "Velocity Participant", "Unleash Space Participant", "Unleash Space Access", "Equipment Training Participant" )) %>%
arrange(count) %>%
mutate(programme = factor(`programme`, levels=.$programme)) %>%
ggplot(aes(programme, count, label=count, fill=count)) +
geom_bar(stat="identity") +
coord_flip() +
guides(fill=FALSE, color=FALSE) +
ggtitle("Programme split overall") +
theme_minimal() +
scale_fill_continuous_tableau() + labs(x="", y="")

# Create dataframe for heatmap
df_stud %>%
select(`programme`, `Owner of Major/Spec/Module`) %>%
filter(!`programme` %in% c("CIE Participant", "Velocity Participant", "Unleash Space Participant", "Unleash Space Access", "Equipment Training Participant" )) %>%
group_by(`programme`,`Owner of Major/Spec/Module`) %>%
summarise(count=n()) %>%
complete(`Owner of Major/Spec/Module` = unique(df_stud$`Owner of Major/Spec/Module`)) %>%
ggplot(aes(`Owner of Major/Spec/Module`, `programme`)) +
geom_tile(aes(fill=count), colour="grey97") +
guides(color=FALSE) +
ggtitle("Programme split overall") +
scale_fill_gradient_tableau(na.value = "white") +
scale_x_discrete(position="top") +
#scale_fill_gradient(low="white", high = "steelblue", na.value="grey80") +
coord_fixed(ratio=.25) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust=0)) +
labs(x="", y="")
